1 package org.apache.lucene.codecs.lucene42;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.Closeable;
21 import java.io.IOException;
22 import java.util.Collection;
23 import java.util.Collections;
24 import java.util.Iterator;
25 import java.util.NoSuchElementException;
26
27 import org.apache.lucene.codecs.CodecUtil;
28 import org.apache.lucene.codecs.TermVectorsReader;
29 import org.apache.lucene.codecs.compressing.CompressionMode;
30 import org.apache.lucene.codecs.compressing.Decompressor;
31 import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsIndexReader;
32 import org.apache.lucene.index.CorruptIndexException;
33 import org.apache.lucene.index.DocsAndPositionsEnum;
34 import org.apache.lucene.index.FieldInfo;
35 import org.apache.lucene.index.FieldInfos;
36 import org.apache.lucene.index.Fields;
37 import org.apache.lucene.index.IndexFileNames;
38 import org.apache.lucene.index.PostingsEnum;
39 import org.apache.lucene.index.SegmentInfo;
40 import org.apache.lucene.index.Terms;
41 import org.apache.lucene.index.TermsEnum;
42 import org.apache.lucene.store.AlreadyClosedException;
43 import org.apache.lucene.store.ByteArrayDataInput;
44 import org.apache.lucene.store.ChecksumIndexInput;
45 import org.apache.lucene.store.Directory;
46 import org.apache.lucene.store.IOContext;
47 import org.apache.lucene.store.IndexInput;
48 import org.apache.lucene.util.Accountable;
49 import org.apache.lucene.util.Accountables;
50 import org.apache.lucene.util.ArrayUtil;
51 import org.apache.lucene.util.BytesRef;
52 import org.apache.lucene.util.IOUtils;
53 import org.apache.lucene.util.LongsRef;
54 import org.apache.lucene.util.packed.BlockPackedReaderIterator;
55 import org.apache.lucene.util.packed.PackedInts;
56
57
58
59
60
/**
 * {@link TermVectorsReader} for the Lucene 4.2 term vectors format.
 * Term vectors are stored in compressed chunks of documents inside a data
 * file ({@code .tvd}); a companion index file ({@code .tvx}) maps doc IDs to
 * chunk start pointers. This reader decodes one whole chunk header per
 * {@link #get(int)} call and decompresses only the slice needed for the
 * requested document.
 *
 * @deprecated only kept around to read old 4.x segments.
 */
@Deprecated
final class Lucene42TermVectorsReader extends TermVectorsReader implements Closeable {

// Field metadata of the segment; maps field numbers back to names.
private final FieldInfos fieldInfos;
// Doc ID -> start pointer index over the vectors data file.
final Lucene41StoredFieldsIndexReader indexReader;
// The vectors data (.tvd) stream.
final IndexInput vectorsStream;
// Format version read (and cross-checked) from both file headers.
private final int version;
// Version of the packed-ints encoding used inside the data file.
private final int packedIntsVersion;
// Compression mode of the chunks (kept for toString/diagnostics).
private final CompressionMode compressionMode;
// Decompressor instance matching compressionMode.
private final Decompressor decompressor;
// Chunk size as read from the data file header.
private final int chunkSize;
// Number of documents in the segment; used to validate chunk headers.
private final int numDocs;
// Set once close() has run; guards against use-after-close.
private boolean closed;
// Reusable block-packed decoder over vectorsStream.
private final BlockPackedReaderIterator reader;

// Extensions of the vectors data and index files.
static final String VECTORS_EXTENSION = "tvd";
static final String VECTORS_INDEX_EXTENSION = "tvx";

// Codec-name suffixes used in the index and data file headers.
static final String CODEC_SFX_IDX = "Index";
static final String CODEC_SFX_DAT = "Data";

// Supported versions; VERSION_CHECKSUM added end-of-file checksums.
static final int VERSION_START = 0;
static final int VERSION_CHECKSUM = 1;
static final int VERSION_CURRENT = VERSION_CHECKSUM;

// Block size used by the block-packed reader/writer.
static final int BLOCK_SIZE = 64;

// Per-field flag bits: which of positions/offsets/payloads are stored.
static final int POSITIONS = 0x01;
static final int OFFSETS = 0x02;
static final int PAYLOADS = 0x04;
// Bits needed to encode any combination of the flags above.
static final int FLAGS_BITS = PackedInts.bitsRequired(POSITIONS | OFFSETS | PAYLOADS);
92
93
/**
 * Cloning constructor used by {@link #clone()}: shares the immutable state of
 * {@code reader} and clones the stateful pieces (streams, index reader,
 * decompressor) so the copy can be used independently.
 */
private Lucene42TermVectorsReader(Lucene42TermVectorsReader reader) {
  this.fieldInfos = reader.fieldInfos;
  this.compressionMode = reader.compressionMode;
  this.chunkSize = reader.chunkSize;
  this.numDocs = reader.numDocs;
  this.version = reader.version;
  this.packedIntsVersion = reader.packedIntsVersion;
  // Stateful members get their own clones.
  this.vectorsStream = reader.vectorsStream.clone();
  this.indexReader = reader.indexReader.clone();
  this.decompressor = reader.decompressor.clone();
  // Fresh block-packed iterator bound to the cloned data stream.
  this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
  this.closed = false;
}
107
108
109 public Lucene42TermVectorsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
110 IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
111 this.compressionMode = compressionMode;
112 final String segment = si.name;
113 boolean success = false;
114 fieldInfos = fn;
115 numDocs = si.maxDoc();
116 ChecksumIndexInput indexStream = null;
117 try {
118
119 final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
120 indexStream = d.openChecksumInput(indexStreamFN, context);
121 final String codecNameIdx = formatName + CODEC_SFX_IDX;
122 version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
123 assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
124 indexReader = new Lucene41StoredFieldsIndexReader(indexStream, si);
125
126 if (version >= VERSION_CHECKSUM) {
127 indexStream.readVLong();
128 CodecUtil.checkFooter(indexStream);
129 } else {
130 CodecUtil.checkEOF(indexStream);
131 }
132 indexStream.close();
133 indexStream = null;
134
135
136 final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
137 vectorsStream = d.openInput(vectorsStreamFN, context);
138 final String codecNameDat = formatName + CODEC_SFX_DAT;
139 int version2 = CodecUtil.checkHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
140 if (version != version2) {
141 throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream);
142 }
143 assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer();
144
145 long pos = vectorsStream.getFilePointer();
146 if (version >= VERSION_CHECKSUM) {
147
148
149
150
151 CodecUtil.retrieveChecksum(vectorsStream);
152 vectorsStream.seek(pos);
153 }
154
155 packedIntsVersion = vectorsStream.readVInt();
156 chunkSize = vectorsStream.readVInt();
157 decompressor = compressionMode.newDecompressor();
158 this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
159
160 success = true;
161 } finally {
162 if (!success) {
163 IOUtils.closeWhileHandlingException(this, indexStream);
164 }
165 }
166 }
167
168
169
170
171 private void ensureOpen() throws AlreadyClosedException {
172 if (closed) {
173 throw new AlreadyClosedException("this FieldsReader is closed");
174 }
175 }
176
177 @Override
178 public void close() throws IOException {
179 if (!closed) {
180 IOUtils.close(vectorsStream);
181 closed = true;
182 }
183 }
184
/**
 * Returns an independent reader over the same data, backed by cloned streams;
 * see the private copy constructor for what is shared vs. cloned.
 */
@Override
public TermVectorsReader clone() {
  return new Lucene42TermVectorsReader(this);
}
189
/**
 * Returns the term vectors of document {@code doc}, or {@code null} if it has
 * no vectored fields. Seeks to the chunk that contains the document, decodes
 * the chunk's metadata (field numbers, flags, term counts, term lengths,
 * freqs, positions, offsets, payload lengths) and decompresses only the byte
 * slice belonging to this document.
 *
 * @throws CorruptIndexException if the chunk header is inconsistent with
 *         {@code doc} or the segment's doc count
 */
@Override
public Fields get(int doc) throws IOException {
  ensureOpen();

  // Seek to the start of the chunk containing doc.
  {
    final long startPointer = indexReader.getStartPointer(doc);
    vectorsStream.seek(startPointer);
  }

  // Chunk header:
  // - docBase: first doc ID of the chunk
  // - chunkDocs: number of docs in the chunk
  final int docBase = vectorsStream.readVInt();
  final int chunkDocs = vectorsStream.readVInt();
  if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
    throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
  }

  final int skip; // number of fields (of earlier docs in the chunk) to skip
  final int numFields; // number of vectored fields of the requested doc
  final int totalFields; // total number of fields in the chunk (all docs)
  if (chunkDocs == 1) {
    skip = 0;
    numFields = totalFields = vectorsStream.readVInt();
  } else {
    // Per-doc field counts are block-packed; sum them up around our doc.
    reader.reset(vectorsStream, chunkDocs);
    int sum = 0;
    for (int i = docBase; i < doc; ++i) {
      sum += reader.next();
    }
    skip = sum;
    numFields = (int) reader.next();
    sum += numFields;
    for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
      sum += reader.next();
    }
    totalFields = sum;
  }

  if (numFields == 0) {
    // no vectors for this document
    return null;
  }

  // Distinct field numbers that have term vectors in this chunk.
  final int[] fieldNums;
  {
    final int token = vectorsStream.readByte() & 0xFF;
    assert token != 0; // zero would mean no vectors; cannot happen since numFields != 0
    final int bitsPerFieldNum = token & 0x1F;
    int totalDistinctFields = token >>> 5;
    if (totalDistinctFields == 0x07) {
      // 0x07 is an escape: the remainder is stored as a vInt.
      totalDistinctFields += vectorsStream.readVInt();
    }
    ++totalDistinctFields;
    final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
    fieldNums = new int[totalDistinctFields];
    for (int i = 0; i < totalDistinctFields; ++i) {
      fieldNums[i] = (int) it.next();
    }
  }

  // Per-field offsets into fieldNums, and per-field flags.
  final int[] fieldNumOffs = new int[numFields];
  final PackedInts.Reader flags;
  {
    final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
    final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
    switch (vectorsStream.readVInt()) {
      case 0:
        // Flags were written once per distinct field: expand them to one
        // entry per field occurrence.
        final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
        PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
        for (int i = 0; i < totalFields; ++i) {
          final int fieldNumOff = (int) allFieldNumOffs.get(i);
          assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
          final int fgs = (int) fieldFlags.get(fieldNumOff);
          f.set(i, fgs);
        }
        flags = f;
        break;
      case 1:
        // Flags were written per field occurrence: read them directly.
        flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
        break;
      default:
        throw new AssertionError();
    }
    for (int i = 0; i < numFields; ++i) {
      fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
    }
  }

  // Number of terms of every field in the chunk.
  final PackedInts.Reader numTerms;
  final int totalTerms;
  {
    final int bitsRequired = vectorsStream.readVInt();
    numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
    int sum = 0;
    for (int i = 0; i < totalFields; ++i) {
      sum += numTerms.get(i);
    }
    totalTerms = sum;
  }

  // Term lengths: prefix (shared with previous term) + suffix lengths.
  // docOff/docLen locate this doc's term bytes inside the chunk.
  int docOff = 0, docLen = 0, totalLen;
  final int[] fieldLengths = new int[numFields];
  final int[][] prefixLengths = new int[numFields][];
  final int[][] suffixLengths = new int[numFields][];
  {
    reader.reset(vectorsStream, totalTerms);
    // Skip the prefix lengths of the fields that precede ours.
    int toSkip = 0;
    for (int i = 0; i < skip; ++i) {
      toSkip += numTerms.get(i);
    }
    reader.skip(toSkip);
    // Read prefix lengths for our doc's fields.
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      final int[] fieldPrefixLengths = new int[termCount];
      prefixLengths[i] = fieldPrefixLengths;
      for (int j = 0; j < termCount; ) {
        final LongsRef next = reader.next(termCount - j);
        for (int k = 0; k < next.length; ++k) {
          fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
        }
      }
    }
    reader.skip(totalTerms - reader.ord());

    reader.reset(vectorsStream, totalTerms);
    // Suffix lengths of preceding fields are summed into docOff (this doc's
    // byte offset inside the decompressed chunk).
    // NOTE(review): this assignment is dead — toSkip is never read again
    // after this point; kept byte-identical here.
    toSkip = 0;
    for (int i = 0; i < skip; ++i) {
      for (int j = 0; j < numTerms.get(i); ++j) {
        docOff += reader.next();
      }
    }
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      final int[] fieldSuffixLengths = new int[termCount];
      suffixLengths[i] = fieldSuffixLengths;
      for (int j = 0; j < termCount; ) {
        final LongsRef next = reader.next(termCount - j);
        for (int k = 0; k < next.length; ++k) {
          fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
        }
      }
      // Stored bytes per field are the suffixes only (prefixes are shared).
      fieldLengths[i] = sum(suffixLengths[i]);
      docLen += fieldLengths[i];
    }
    totalLen = docOff + docLen;
    // Remaining fields after ours contribute to the chunk's total length.
    for (int i = skip + numFields; i < totalFields; ++i) {
      for (int j = 0; j < numTerms.get(i); ++j) {
        totalLen += reader.next();
      }
    }
  }

  // Term freqs, stored as (freq - 1) for every term in the chunk.
  final int[] termFreqs = new int[totalTerms];
  {
    reader.reset(vectorsStream, totalTerms);
    for (int i = 0; i < totalTerms; ) {
      final LongsRef next = reader.next(totalTerms - i);
      for (int k = 0; k < next.length; ++k) {
        termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
      }
    }
  }

  // Total number of positions, offsets and payloads in the chunk.
  int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
  for (int i = 0, termIndex = 0; i < totalFields; ++i) {
    final int f = (int) flags.get(i);
    final int termCount = (int) numTerms.get(i);
    for (int j = 0; j < termCount; ++j) {
      final int freq = termFreqs[termIndex++];
      if ((f & POSITIONS) != 0) {
        totalPositions += freq;
      }
      if ((f & OFFSETS) != 0) {
        totalOffsets += freq;
      }
      if ((f & PAYLOADS) != 0) {
        totalPayloads += freq;
      }
    }
    assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
  }

  // Cumulative freq index per field/term; then raw positions if any.
  final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
  final int[][] positions, startOffsets, lengths;
  if (totalPositions > 0) {
    positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
  } else {
    positions = new int[numFields][];
  }

  if (totalOffsets > 0) {
    // Average number of chars per term, one float per distinct field; used
    // to reconstruct absolute start offsets from position-relative deltas.
    final float[] charsPerTerm = new float[fieldNums.length];
    for (int i = 0; i < charsPerTerm.length; ++i) {
      charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
    }
    startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
    lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);

    for (int i = 0; i < numFields; ++i) {
      final int[] fStartOffsets = startOffsets[i];
      final int[] fPositions = positions[i];
      // Undo the position-based prediction applied at write time.
      if (fStartOffsets != null && fPositions != null) {
        final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
        for (int j = 0; j < startOffsets[i].length; ++j) {
          fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
        }
      }
      if (fStartOffsets != null) {
        final int[] fPrefixLengths = prefixLengths[i];
        final int[] fSuffixLengths = suffixLengths[i];
        final int[] fLengths = lengths[i];
        for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
          // Delta-decode start offsets and patch lengths with the term length.
          final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
          lengths[i][positionIndex[i][j]] += termLength;
          for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
            fStartOffsets[k] += fStartOffsets[k - 1];
            fLengths[k] += termLength;
          }
        }
      }
    }
  } else {
    startOffsets = lengths = new int[numFields][];
  }
  if (totalPositions > 0) {
    // Delta-decode positions (written as gaps per term occurrence).
    for (int i = 0; i < numFields; ++i) {
      final int[] fPositions = positions[i];
      final int[] fpositionIndex = positionIndex[i];
      if (fPositions != null) {
        for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
          for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
            fPositions[k] += fPositions[k - 1];
          }
        }
      }
    }
  }

  // Payload lengths: build a cumulative payload index per field and compute
  // the byte range of this doc's payloads inside the chunk.
  final int[][] payloadIndex = new int[numFields][];
  int totalPayloadLength = 0;
  int payloadOff = 0;
  int payloadLen = 0;
  if (totalPayloads > 0) {
    reader.reset(vectorsStream, totalPayloads);
    // Payload bytes of preceding docs' fields accumulate into payloadOff.
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      if ((f & PAYLOADS) != 0) {
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          for (int k = 0; k < freq; ++k) {
            final int l = (int) reader.next();
            payloadOff += l;
          }
        }
      }
      termIndex += termCount;
    }
    totalPayloadLength = payloadOff;
    // Payload lengths of this doc's fields: cumulative index per occurrence.
    for (int i = 0; i < numFields; ++i) {
      final int f = (int) flags.get(skip + i);
      final int termCount = (int) numTerms.get(skip + i);
      if ((f & PAYLOADS) != 0) {
        final int totalFreq = positionIndex[i][termCount];
        payloadIndex[i] = new int[totalFreq + 1];
        int posIdx = 0;
        payloadIndex[i][posIdx] = payloadLen;
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          for (int k = 0; k < freq; ++k) {
            final int payloadLength = (int) reader.next();
            payloadLen += payloadLength;
            payloadIndex[i][posIdx+1] = payloadLen;
            ++posIdx;
          }
        }
        assert posIdx == totalFreq;
      }
      termIndex += termCount;
    }
    totalPayloadLength += payloadLen;
    // Remaining fields only contribute to the chunk's total payload length.
    for (int i = skip + numFields; i < totalFields; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      if ((f & PAYLOADS) != 0) {
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          for (int k = 0; k < freq; ++k) {
            totalPayloadLength += reader.next();
          }
        }
      }
      termIndex += termCount;
    }
    assert termIndex == totalTerms : termIndex + " " + totalTerms;
  }

  // Decompress only the slice of the chunk that belongs to this doc:
  // [docOff+payloadOff, docOff+payloadOff+docLen+payloadLen).
  final BytesRef suffixBytes = new BytesRef();
  decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
  suffixBytes.length = docLen;
  final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);

  // field index -> flags, for this doc only
  final int[] fieldFlags = new int[numFields];
  for (int i = 0; i < numFields; ++i) {
    fieldFlags[i] = (int) flags.get(skip + i);
  }

  // field index -> number of terms, for this doc only
  final int[] fieldNumTerms = new int[numFields];
  for (int i = 0; i < numFields; ++i) {
    fieldNumTerms[i] = (int) numTerms.get(skip + i);
  }

  // field index -> per-term freqs, for this doc only
  final int[][] fieldTermFreqs = new int[numFields][];
  {
    int termIdx = 0;
    for (int i = 0; i < skip; ++i) {
      termIdx += numTerms.get(i);
    }
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      fieldTermFreqs[i] = new int[termCount];
      for (int j = 0; j < termCount; ++j) {
        fieldTermFreqs[i][j] = termFreqs[termIdx++];
      }
    }
  }

  assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;

  return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths,
      prefixLengths, suffixLengths, fieldTermFreqs,
      positionIndex, positions, startOffsets, lengths,
      payloadBytes, payloadIndex,
      suffixBytes);
}
546
547
548 private int[][] positionIndex(int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
549 final int[][] positionIndex = new int[numFields][];
550 int termIndex = 0;
551 for (int i = 0; i < skip; ++i) {
552 final int termCount = (int) numTerms.get(i);
553 termIndex += termCount;
554 }
555 for (int i = 0; i < numFields; ++i) {
556 final int termCount = (int) numTerms.get(skip + i);
557 positionIndex[i] = new int[termCount + 1];
558 for (int j = 0; j < termCount; ++j) {
559 final int freq = termFreqs[termIndex+j];
560 positionIndex[i][j + 1] = positionIndex[i][j] + freq;
561 }
562 termIndex += termCount;
563 }
564 return positionIndex;
565 }
566
/**
 * Reads one block-packed sequence (positions, start offsets or lengths —
 * selected by {@code flag}) for this doc's fields, skipping entries that
 * belong to other docs of the chunk. Fields whose flags do not contain
 * {@code flag} get a {@code null} entry in the returned array.
 *
 * @param flag one of {@link #POSITIONS} or {@link #OFFSETS}
 * @param totalPositions total number of entries of this kind in the chunk
 * @param positionIndex cumulative freq tables from {@link #positionIndex}
 */
private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
  final int[][] positions = new int[numFields][];
  reader.reset(vectorsStream, totalPositions);
  // Count how many entries belong to the skipped (preceding) fields.
  int toSkip = 0;
  int termIndex = 0;
  for (int i = 0; i < skip; ++i) {
    final int f = (int) flags.get(i);
    final int termCount = (int) numTerms.get(i);
    if ((f & flag) != 0) {
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex+j];
        toSkip += freq;
      }
    }
    termIndex += termCount;
  }
  reader.skip(toSkip);
  // Read the entries of this doc's fields, one flat array per field.
  for (int i = 0; i < numFields; ++i) {
    final int f = (int) flags.get(skip + i);
    final int termCount = (int) numTerms.get(skip + i);
    if ((f & flag) != 0) {
      final int totalFreq = positionIndex[i][termCount];
      final int[] fieldPositions = new int[totalFreq];
      positions[i] = fieldPositions;
      for (int j = 0; j < totalFreq; ) {
        final LongsRef nextPositions = reader.next(totalFreq - j);
        for (int k = 0; k < nextPositions.length; ++k) {
          fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
        }
      }
    }
    termIndex += termCount;
  }
  // Leave the block-packed reader positioned after the whole sequence.
  reader.skip(totalPositions - reader.ord());
  return positions;
}
605
/**
 * {@link Fields} implementation over the fully-decoded term vectors of a
 * single document, as produced by {@link #get(int)}. Non-static: it uses the
 * enclosing reader's {@code fieldInfos} to map field numbers to names.
 */
private class TVFields extends Fields {

  // Parallel per-field arrays for this document (see get(int) for how each
  // is built); fieldNums lists the distinct field numbers of the chunk and
  // fieldNumOffs indexes into it.
  private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
  private final int[][] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
  // Concatenated term suffix bytes and payload bytes of this document.
  private final BytesRef suffixBytes, payloadBytes;

  public TVFields(int[] fieldNums, int[] fieldFlags, int[] fieldNumOffs, int[] numTerms, int[] fieldLengths,
      int[][] prefixLengths, int[][] suffixLengths, int[][] termFreqs,
      int[][] positionIndex, int[][] positions, int[][] startOffsets, int[][] lengths,
      BytesRef payloadBytes, int[][] payloadIndex,
      BytesRef suffixBytes) {
    this.fieldNums = fieldNums;
    this.fieldFlags = fieldFlags;
    this.fieldNumOffs = fieldNumOffs;
    this.numTerms = numTerms;
    this.fieldLengths = fieldLengths;
    this.prefixLengths = prefixLengths;
    this.suffixLengths = suffixLengths;
    this.termFreqs = termFreqs;
    this.positionIndex = positionIndex;
    this.positions = positions;
    this.startOffsets = startOffsets;
    this.lengths = lengths;
    this.payloadBytes = payloadBytes;
    this.payloadIndex = payloadIndex;
    this.suffixBytes = suffixBytes;
  }

  /** Iterates field names in the order the fields were stored for this doc. */
  @Override
  public Iterator<String> iterator() {
    return new Iterator<String>() {
      int i = 0;
      @Override
      public boolean hasNext() {
        return i < fieldNumOffs.length;
      }
      @Override
      public String next() {
        if (!hasNext()) {
          throw new NoSuchElementException();
        }
        // Resolve field number -> name via the segment's FieldInfos.
        final int fieldNum = fieldNums[fieldNumOffs[i++]];
        return fieldInfos.fieldInfo(fieldNum).name;
      }
      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Returns a {@link Terms} view of {@code field}, or {@code null} if the
   * field is unknown or has no terms in this document's vectors.
   */
  @Override
  public Terms terms(String field) throws IOException {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    if (fieldInfo == null) {
      return null;
    }
    // Linear scan for the field's index within this doc's field list.
    int idx = -1;
    for (int i = 0; i < fieldNumOffs.length; ++i) {
      if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
        idx = i;
        break;
      }
    }

    if (idx == -1 || numTerms[idx] == 0) {
      // no term vectors for this field
      return null;
    }
    // Locate the field's byte slice within the doc's concatenated suffixes.
    int fieldOff = 0, fieldLen = -1;
    for (int i = 0; i < fieldNumOffs.length; ++i) {
      if (i < idx) {
        fieldOff += fieldLengths[i];
      } else {
        fieldLen = fieldLengths[i];
        break;
      }
    }
    assert fieldLen >= 0;
    return new TVTerms(numTerms[idx], fieldFlags[idx],
        prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
        positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
        payloadIndex[idx], payloadBytes,
        new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
  }

  /** Number of vectored fields of this document. */
  @Override
  public int size() {
    return fieldNumOffs.length;
  }

}
698
699 private class TVTerms extends Terms {
700
701 private final int numTerms, flags;
702 private final int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
703 private final BytesRef termBytes, payloadBytes;
704
705 TVTerms(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs,
706 int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
707 int[] payloadIndex, BytesRef payloadBytes,
708 BytesRef termBytes) {
709 this.numTerms = numTerms;
710 this.flags = flags;
711 this.prefixLengths = prefixLengths;
712 this.suffixLengths = suffixLengths;
713 this.termFreqs = termFreqs;
714 this.positionIndex = positionIndex;
715 this.positions = positions;
716 this.startOffsets = startOffsets;
717 this.lengths = lengths;
718 this.payloadIndex = payloadIndex;
719 this.payloadBytes = payloadBytes;
720 this.termBytes = termBytes;
721 }
722
723 @Override
724 public TermsEnum iterator() throws IOException {
725 final TVTermsEnum termsEnum = new TVTermsEnum();
726 termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
727 payloadIndex, payloadBytes,
728 new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
729 return termsEnum;
730 }
731
732 @Override
733 public long size() throws IOException {
734 return numTerms;
735 }
736
737 @Override
738 public long getSumTotalTermFreq() throws IOException {
739 return -1L;
740 }
741
742 @Override
743 public long getSumDocFreq() throws IOException {
744 return numTerms;
745 }
746
747 @Override
748 public int getDocCount() throws IOException {
749 return 1;
750 }
751
752 @Override
753 public boolean hasFreqs() {
754 return true;
755 }
756
757 @Override
758 public boolean hasOffsets() {
759 return (flags & OFFSETS) != 0;
760 }
761
762 @Override
763 public boolean hasPositions() {
764 return (flags & POSITIONS) != 0;
765 }
766
767 @Override
768 public boolean hasPayloads() {
769 return (flags & PAYLOADS) != 0;
770 }
771
772 }
773
/**
 * {@link TermsEnum} over the terms of one vectored field, decoding
 * prefix-compressed term bytes on the fly from an in-memory buffer.
 */
private static class TVTermsEnum extends TermsEnum {

  // numTerms: term count; startPos: buffer position of the first suffix;
  // ord: current term ordinal (-1 before the first next()).
  private int numTerms, startPos, ord;
  // Per-term tables shared with the owning TVTerms (see that class).
  private int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
  // Input over the field's concatenated term suffix bytes.
  private ByteArrayDataInput in;
  private BytesRef payloads;
  // Scratch holding the current term (prefix + suffix), grown as needed.
  private final BytesRef term;

  private TVTermsEnum() {
    term = new BytesRef(16);
  }

  // Re-initializes this enum for a field. NOTE: the flags parameter is
  // currently unused here; it mirrors the writer-side signature.
  void reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
      int[] payloadIndex, BytesRef payloads, ByteArrayDataInput in) {
    this.numTerms = numTerms;
    this.prefixLengths = prefixLengths;
    this.suffixLengths = suffixLengths;
    this.termFreqs = termFreqs;
    this.positionIndex = positionIndex;
    this.positions = positions;
    this.startOffsets = startOffsets;
    this.lengths = lengths;
    this.payloadIndex = payloadIndex;
    this.payloads = payloads;
    this.in = in;
    startPos = in.getPosition();
    reset();
  }

  // Rewinds to before the first term.
  void reset() {
    term.length = 0;
    in.setPosition(startPos);
    ord = -1;
  }

  /**
   * Advances to the next term, or returns {@code null} when exhausted.
   * The previous term's prefix bytes are reused: only the suffix is read.
   */
  @Override
  public BytesRef next() throws IOException {
    if (ord == numTerms - 1) {
      return null;
    } else {
      assert ord < numTerms;
      ++ord;
    }

    // Build the term: keep the shared prefix, append the stored suffix.
    term.offset = 0;
    term.length = prefixLengths[ord] + suffixLengths[ord];
    if (term.length > term.bytes.length) {
      term.bytes = ArrayUtil.grow(term.bytes, term.length);
    }
    in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);

    return term;
  }

  /**
   * Linear-scan seek: rewinds if the target sorts before the current term,
   * then steps forward until a term >= text is found.
   */
  @Override
  public SeekStatus seekCeil(BytesRef text)
      throws IOException {
    if (ord < numTerms && ord >= 0) {
      final int cmp = term().compareTo(text);
      if (cmp == 0) {
        return SeekStatus.FOUND;
      } else if (cmp > 0) {
        // Already past the target: restart from the beginning.
        reset();
      }
    }

    // Linear scan forward.
    while (true) {
      final BytesRef term = next();
      if (term == null) {
        return SeekStatus.END;
      }
      final int cmp = term.compareTo(text);
      if (cmp > 0) {
        return SeekStatus.NOT_FOUND;
      } else if (cmp == 0) {
        return SeekStatus.FOUND;
      }
    }
  }

  // Ord-based seeking is not supported by this enum.
  @Override
  public void seekExact(long ord) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public BytesRef term() throws IOException {
    return term;
  }

  @Override
  public long ord() throws IOException {
    throw new UnsupportedOperationException();
  }

  // One document per term vector, so docFreq is always 1.
  @Override
  public int docFreq() throws IOException {
    return 1;
  }

  @Override
  public long totalTermFreq() throws IOException {
    return termFreqs[ord];
  }

  @Override
  public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    // Legacy null-semantics: callers that requested positions/offsets via the
    // old API expect null when neither is available.
    if (PostingsEnum.featureRequested(flags, DocsAndPositionsEnum.OLD_NULL_SEMANTICS)) {
      if (positions == null && startOffsets == null) {
        return null;
      }
    }

    final TVDocsEnum docsEnum;
    if (reuse != null && reuse instanceof TVDocsEnum) {
      docsEnum = (TVDocsEnum) reuse;
    } else {
      docsEnum = new TVDocsEnum();
    }

    docsEnum.reset(termFreqs[ord], positionIndex[ord], positions, startOffsets, lengths, payloads, payloadIndex);
    return docsEnum;
  }

}
901
/**
 * {@link PostingsEnum} for a single term of a term vector. Enumerates exactly
 * one document (docID 0) and iterates that term's positions/offsets/payloads
 * out of the pre-decoded flat arrays.
 */
private static class TVDocsEnum extends PostingsEnum {

  // -1 before nextDoc(), 0 on the single doc, NO_MORE_DOCS afterwards.
  private int doc = -1;
  private int termFreq;
  // Index of this term's first occurrence within the flat per-field arrays.
  private int positionIndex;
  // Flat per-field arrays; any of these may be null when not stored.
  private int[] positions;
  private int[] startOffsets;
  private int[] lengths;
  // Scratch ref whose offset/length are repositioned per occurrence.
  private final BytesRef payload;
  private int[] payloadIndex;
  // Offset of the field's payload bytes within the shared payload buffer.
  private int basePayloadOffset;
  // Current occurrence index within [0, termFreq).
  private int i;

  TVDocsEnum() {
    payload = new BytesRef();
  }

  // Re-initializes for one term; positionIndex is the term's start slot in
  // the flat arrays, payloadIndex the field's cumulative payload lengths.
  public void reset(int freq, int positionIndex, int[] positions,
      int[] startOffsets, int[] lengths, BytesRef payloads,
      int[] payloadIndex) {
    this.termFreq = freq;
    this.positionIndex = positionIndex;
    this.positions = positions;
    this.startOffsets = startOffsets;
    this.lengths = lengths;
    this.basePayloadOffset = payloads.offset;
    this.payload.bytes = payloads.bytes;
    payload.offset = payload.length = 0;
    this.payloadIndex = payloadIndex;

    doc = i = -1;
  }

  // Asserts that nextDoc() has been called and the enum is not exhausted.
  private void checkDoc() {
    if (doc == NO_MORE_DOCS) {
      throw new IllegalStateException("DocsEnum exhausted");
    } else if (doc == -1) {
      throw new IllegalStateException("DocsEnum not started");
    }
  }

  // Asserts that nextPosition() has been called and is still in range.
  private void checkPosition() {
    checkDoc();
    if (i < 0) {
      throw new IllegalStateException("Position enum not started");
    } else if (i >= termFreq) {
      throw new IllegalStateException("Read past last position");
    }
  }

  /**
   * Advances to the next occurrence; also repositions the payload scratch
   * ref. Returns -1 when positions were not stored (offsets-only vectors).
   */
  @Override
  public int nextPosition() throws IOException {
    // Inlined doc check: positions may only be read on the single doc (0).
    if (doc != 0) {
      throw new IllegalStateException();
    } else if (i >= termFreq - 1) {
      throw new IllegalStateException("Read past last position");
    }

    ++i;

    if (payloadIndex != null) {
      payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
      payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
    }

    if (positions == null) {
      return -1;
    } else {
      return positions[positionIndex + i];
    }
  }

  @Override
  public int startOffset() throws IOException {
    checkPosition();
    if (startOffsets == null) {
      return -1;
    } else {
      return startOffsets[positionIndex + i];
    }
  }

  @Override
  public int endOffset() throws IOException {
    checkPosition();
    if (startOffsets == null) {
      return -1;
    } else {
      // End offset is reconstructed as start + stored length.
      return startOffsets[positionIndex + i] + lengths[positionIndex + i];
    }
  }

  /** Returns the current occurrence's payload, or null if absent/empty. */
  @Override
  public BytesRef getPayload() throws IOException {
    checkPosition();
    if (payloadIndex == null || payload.length == 0) {
      return null;
    } else {
      return payload;
    }
  }

  @Override
  public int freq() throws IOException {
    checkDoc();
    return termFreq;
  }

  @Override
  public int docID() {
    return doc;
  }

  // Exactly one document: 0, then exhausted.
  @Override
  public int nextDoc() throws IOException {
    if (doc == -1) {
      return (doc = 0);
    } else {
      return (doc = NO_MORE_DOCS);
    }
  }

  @Override
  public int advance(int target) throws IOException {
    return slowAdvance(target);
  }

  @Override
  public long cost() {
    return 1;
  }
}
1034
1035 private static int sum(int[] arr) {
1036 int sum = 0;
1037 for (int el : arr) {
1038 sum += el;
1039 }
1040 return sum;
1041 }
1042
/** RAM usage is dominated by the in-memory doc->pointer index. */
@Override
public long ramBytesUsed() {
  return indexReader.ramBytesUsed();
}
1047
/** Exposes the term vector index as the single accountable child resource. */
@Override
public Collection<Accountable> getChildResources() {
  return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
}
1052
1053 @Override
1054 public void checkIntegrity() throws IOException {
1055 if (version >= VERSION_CHECKSUM) {
1056 CodecUtil.checksumEntireFile(vectorsStream);
1057 }
1058 }
1059
1060 @Override
1061 public String toString() {
1062 return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";
1063 }
1064 }